import urllib.request
import os.path
import re
from bs4 import BeautifulSoup as bfs

# Common prefix of every listing-page URL built by this script.
base_url = 'http://www.23us.so/list/'

# Request headers that make the script look like a desktop browser.
headers = {
    'User-Agent': ' '.join((
        'Mozilla/5.0 (Windows NT 6.1; WOW64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/32.0.1700.76',
        'Safari/537.36',
    )),
}

# Matches an <em id="pagestats">...</em> element; DOTALL lets the element's
# content span several lines.
page_idx_pattern = re.compile(r'<em id="pagestats">.*?</em>', flags=re.DOTALL)

def get_type_url(type_index):
    """Return the URL of the first listing page for category *type_index*.

    The result is ``base_url`` followed by ``<type_index>_1.html``.
    """
    return f'{base_url}{type_index!s}_1.html'

def get_one_page(type_index, page_index):
    """Return the URL of listing page *page_index* of category *type_index*.

    The result is ``base_url`` followed by ``<type_index>_<page_index>.html``.
    """
    file_name = '_'.join((str(type_index), str(page_index))) + '.html'
    return base_url + file_name

def get_type_page_content(url):
    """Fetch *url* and return the response body decoded to text.

    The request is sent with the module-level browser-like ``headers``.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(req) as resp:
        # Honour the charset from Content-Type; fall back to UTF-8, which is
        # what a bare ``bytes.decode()`` would use.
        charset = resp.headers.get_content_charset() or 'utf-8'
        return resp.read().decode(charset)


def download_motion(motion):
    """Validate a scraped record before downloading it.

    Args:
        motion: Mapping that may contain the keys ``'name'``, ``'author'``,
            ``'info_url'`` and ``'detail_url'``. Missing keys are treated
            the same as empty values.

    Returns:
        None. Records with any missing or empty field are skipped.
    """
    # .get() instead of indexing: rows that could not be fully parsed arrive
    # here as partial (or empty) dicts and must not raise KeyError.
    name = motion.get('name')
    author = motion.get('author')
    info_url = motion.get('info_url')
    detail_url = motion.get('detail_url')
    # All four fields are required; the original check was inverted for
    # detail_url and bailed out exactly when the record was complete.
    if not (name and author and info_url and detail_url):
        return
    # TODO: the actual download step is not implemented yet.


def _read_page_count(soup):
    """Return the total page count from a listing page's pager, or None.

    The pager text is expected to look like ``current/total``. None is
    returned when the element is missing or its text cannot be parsed.
    """
    # Prefer the <em id="pagestats"> element; fall back to the first <em>
    # when no element carries that id.
    pager = soup.find('em', id='pagestats')
    if pager is None:
        pager = soup.em
    if pager is None or pager.string is None:
        return None
    parts = str(pager.string).split('/')
    if len(parts) < 2:
        return None
    try:
        return int(parts[1])
    except ValueError:
        return None


def _parse_motion_row(cells):
    """Build a record dict from the <td> cells of one table row.

    Cell 1 must hold a link (name + info URL), cell 2 a link (detail URL)
    and cell 3 the author text. Returns None when the row does not have
    that shape, so callers never see a partially filled record.
    """
    if len(cells) < 3:
        return None
    name_links = cells[0].select('a')
    detail_links = cells[1].select('a')
    if not name_links or not detail_links:
        return None
    return {
        'name': name_links[0].string,
        'info_url': name_links[0].attrs.get('href'),
        'detail_url': detail_links[0].attrs.get('href'),
        'author': cells[2].string,
    }


# Walk categories 1..9 and, for each, every listing page reported by its pager.
for idx in range(1, 10):
    data = get_type_page_content(get_type_url(idx))
    all_page = _read_page_count(bfs(data, 'html.parser'))
    if all_page is None:
        # No usable pager on this category's first page: skip the category
        # instead of crashing on a missing element or malformed text.
        continue
    print(all_page)

    for page_idx in range(1, all_page + 1):
        data = get_type_page_content(get_one_page(idx, page_idx))
        page = bfs(data, 'html.parser')
        for tr in page.select('tr'):
            td_list = tr.select('td')
            if not td_list:
                # Rows without <td> cells carry no record.
                continue
            motion = _parse_motion_row(td_list)
            if motion is None:
                continue
            print(motion)
            download_motion(motion)
        # NOTE(review): this break and the one below stop the crawl after the
        # first page of the first usable category -- presumably a development
        # limiter. Remove both to crawl every page of every category.
        break
    break

